{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# GBM Hyperparameter Tuning with sklearn"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Imports & Settings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from time import time\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import warnings\n",
    "from sklearn.ensemble import GradientBoostingClassifier\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "from itertools import product\n",
    "from sklearn.externals import joblib\n",
    "from pathlib import Path\n",
    "\n",
    "warnings.filterwarnings('ignore')\n",
    "np.random.seed(42)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create one-hot encoding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_one_hot_data(df, cols=('year', 'month', 'age', 'msize')):\n",
    "    cols = list(cols)\n",
    "    df = pd.get_dummies(df,\n",
    "                        columns=cols + ['sector'],\n",
    "                        prefix=cols + [''],\n",
    "                        prefix_sep=['_'] * len(cols) + [''])\n",
    "    return df.rename(columns={c: c.replace('.0', '').replace(' ', '_').lower() for c in df.columns})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create holdout test set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_holdout_set(target, features, period=6):\n",
    "    idx = pd.IndexSlice\n",
    "    label = target.name\n",
    "    dates = np.sort(target.index.get_level_values('date').unique())\n",
    "    cv_start, cv_end = dates[0], dates[-period - 2]\n",
    "    holdout_start, holdout_end = dates[-period - 1], dates[-1]\n",
    "\n",
    "    df = features.join(target.to_frame())\n",
    "    train = df.loc[idx[:, cv_start: cv_end], :]\n",
    "    y_train, X_train = train[label], train.drop(label, axis=1)\n",
    "\n",
    "    test = df.loc[idx[:, holdout_start: holdout_end], :]\n",
    "    y_test, X_test = test[label], test.drop(label, axis=1)\n",
    "    return y_train, X_train, y_test, X_test"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Custom TimeSeriesSplit"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "class OneStepTimeSeriesSplit:\n",
    "    \"\"\"Generates tuples of train_idx, test_idx pairs\n",
    "    Assumes the index contains a level labeled 'date'\"\"\"\n",
    "\n",
    "    def __init__(self, n_splits=3, test_period_length=1, shuffle=False):\n",
    "        self.n_splits = n_splits\n",
    "        self.test_period_length = test_period_length\n",
    "        self.shuffle = shuffle\n",
    "        self.test_end = n_splits * test_period_length\n",
    "\n",
    "    @staticmethod\n",
    "    def chunks(l, n):\n",
    "        for i in range(0, len(l), n):\n",
    "            yield l[i:i + n]\n",
    "\n",
    "    def split(self, X, y=None, groups=None):\n",
    "        unique_dates = (X\n",
    "                            .index\n",
    "                            .get_level_values('date')\n",
    "                            .unique()\n",
    "                            .sort_values(ascending=False)\n",
    "        [:self.test_end])\n",
    "\n",
    "        dates = X.reset_index()[['date']]\n",
    "        for test_date in self.chunks(unique_dates, self.test_period_length):\n",
    "            train_idx = dates[dates.date < min(test_date)].index\n",
    "            test_idx = dates[dates.date.isin(test_date)].index\n",
    "            if self.shuffle:\n",
    "                np.random.shuffle(list(train_idx))\n",
    "            yield train_idx, test_idx\n",
    "    \n",
    "    def get_n_splits(self, X, y, groups=None):\n",
    "        return self.n_splits            "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Instantiate GradientBoostingClassifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "gb_clf = GradientBoostingClassifier(loss='deviance',\n",
    "                                    learning_rate=0.1,\n",
    "                                    n_estimators=100,\n",
    "                                    subsample=1.0,\n",
    "                                    criterion='friedman_mse',\n",
    "                                    min_samples_split=2,\n",
    "                                    min_samples_leaf=1,\n",
    "                                    min_weight_fraction_leaf=0.0,\n",
    "                                    max_depth=3,\n",
    "                                    min_impurity_decrease=0.0,\n",
    "                                    min_impurity_split=None,\n",
    "                                    init=None,\n",
    "                                    random_state=None,\n",
    "                                    max_features=None,\n",
    "                                    verbose=0,\n",
    "                                    max_leaf_nodes=None,\n",
    "                                    warm_start=False,\n",
    "                                    presort='auto',\n",
    "                                    validation_fraction=0.1,\n",
    "                                    n_iter_no_change=None,\n",
    "                                    tol=0.0001)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load Data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We use the dataset generated by the notebook [feature-engineering](../04_alpha_factor_research/00_data/feature_engineering.ipynb) from [Chapter 4 on Alpha Factor Research](../04_alpha_factor_research) that needs to be executed first."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "DATA_STORE = Path('../../data/assets.h5')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_data(start='2000', end='2018', holding_period=1, dropna=False):\n",
    "    idx = pd.IndexSlice\n",
    "    target = f'target_{holding_period}m'\n",
    "    with pd.HDFStore(DATA_STORE) as store:\n",
    "        df = store['engineered_features']\n",
    "\n",
    "    if start is not None and end is not None:\n",
    "        df = df.loc[idx[:, start: end], :]\n",
    "    if dropna:\n",
    "        df = df.dropna()\n",
    "\n",
    "    y = (df[target] > 0).astype(int)\n",
    "    X = df.drop([c for c in df.columns if c.startswith('target')], axis=1)\n",
    "    return y, X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "n_splits = 12\n",
    "y, features = get_data()\n",
    "X = get_one_hot_data(features).dropna()\n",
    "\n",
    "y, X, y_test, X_test = get_holdout_set(target=y,\n",
    "                                       features=X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with pd.HDFStore('model_tuning.h5') as store:\n",
    "    store.put('holdout/features', X_test)\n",
    "    store.put('holdout/target', y_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Setup GridSearchCV"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The [GridSearchCV](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html) class in sklearn's [model_selection](https://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection) module facilitates the systematic evaluation of all combinations of the hyperparameter values that we would like to test.\n",
    " \n",
    "In the following code, we will illustrate this functionality for seven tuning parameters that will result in a total of 24 x 32 x 4 = 576 different model configurations."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Parameter Grid"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "First we define the cross-validation iterator:"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "cv = OneStepTimeSeriesSplit(n_splits=n_splits)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "And next the parameter grid"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "param_grid = dict(\n",
    "        learning_rate=[.01, .1, .2],\n",
    "        max_depth=list(range(3, 13, 3)),\n",
    "        max_features=['sqrt', .8, 1],\n",
    "        min_impurity_decrease=[0, .01],\n",
    "        min_samples_split=[10, 50],\n",
    "        n_estimators=[100, 300],\n",
    "        subsample=[.8, 1],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "# Models = : 576\n"
     ]
    }
   ],
   "source": [
    "all_params = list(product(*param_grid.values()))\n",
    "print('# Models = :', len(all_params))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Instantiate GridSearchCV"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "gs = GridSearchCV(gb_clf,\n",
    "                  param_grid,\n",
    "                  cv=cv,\n",
    "                  scoring='roc_auc',\n",
    "                  verbose=3,\n",
    "                  n_jobs=-1,\n",
    "                  return_train_score=True)\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Fit GridSearchCV"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This can take several days..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "start = time()\n",
    "gs.fit(X=X, y=y)\n",
    "done = time()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Persist Results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(f'Done in {done:.2f}s')\n",
    "joblib.dump(gs, 'gbm_gridsearch.joblib')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": true,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
